In [2]:
pip install -U ydata-profiling
Requirement already satisfied: ydata-profiling in /opt/anaconda3/lib/python3.12/site-packages (4.12.1) Requirement already satisfied: scipy<1.14,>=1.4.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (1.13.1) Requirement already satisfied: pandas!=1.4.0,<3,>1.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (2.2.2) Requirement already satisfied: matplotlib<3.10,>=3.5 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (3.9.2) Requirement already satisfied: pydantic>=2 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (2.8.2) Requirement already satisfied: PyYAML<6.1,>=5.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (6.0.1) Requirement already satisfied: jinja2<3.2,>=2.11.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (3.1.4) Requirement already satisfied: visions<0.7.7,>=0.7.5 in /opt/anaconda3/lib/python3.12/site-packages (from visions[type_image_path]<0.7.7,>=0.7.5->ydata-profiling) (0.7.6) Requirement already satisfied: numpy<2.2,>=1.16.0 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (1.26.4) Requirement already satisfied: htmlmin==0.1.12 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (0.1.12) Requirement already satisfied: phik<0.13,>=0.11.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (0.12.4) Requirement already satisfied: requests<3,>=2.24.0 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (2.32.3) Requirement already satisfied: tqdm<5,>=4.48.2 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (4.66.5) Requirement already satisfied: seaborn<0.14,>=0.10.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (0.13.2) Requirement already satisfied: multimethod<2,>=1.4 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (1.12) Requirement already satisfied: statsmodels<1,>=0.13.2 in 
/opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (0.14.2) Requirement already satisfied: typeguard<5,>=3 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (4.4.1) Requirement already satisfied: imagehash==4.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (4.3.1) Requirement already satisfied: wordcloud>=1.9.3 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (1.9.4) Requirement already satisfied: dacite>=1.8 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (1.8.1) Requirement already satisfied: numba<1,>=0.56.0 in /opt/anaconda3/lib/python3.12/site-packages (from ydata-profiling) (0.60.0) Requirement already satisfied: PyWavelets in /opt/anaconda3/lib/python3.12/site-packages (from imagehash==4.3.1->ydata-profiling) (1.7.0) Requirement already satisfied: pillow in /opt/anaconda3/lib/python3.12/site-packages (from imagehash==4.3.1->ydata-profiling) (10.4.0) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.12/site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling) (2.1.3) Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata-profiling) (1.2.0) Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata-profiling) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata-profiling) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata-profiling) (1.4.4) Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata-profiling) (24.1) Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from 
matplotlib<3.10,>=3.5->ydata-profiling) (3.1.2) Requirement already satisfied: python-dateutil>=2.7 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata-profiling) (2.9.0.post0) Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /opt/anaconda3/lib/python3.12/site-packages (from numba<1,>=0.56.0->ydata-profiling) (0.43.0) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas!=1.4.0,<3,>1.1->ydata-profiling) (2024.1) Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas!=1.4.0,<3,>1.1->ydata-profiling) (2023.3) Requirement already satisfied: joblib>=0.14.1 in /opt/anaconda3/lib/python3.12/site-packages (from phik<0.13,>=0.11.1->ydata-profiling) (1.4.2) Requirement already satisfied: annotated-types>=0.4.0 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic>=2->ydata-profiling) (0.6.0) Requirement already satisfied: pydantic-core==2.20.1 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic>=2->ydata-profiling) (2.20.1) Requirement already satisfied: typing-extensions>=4.6.1 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic>=2->ydata-profiling) (4.11.0) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3,>=2.24.0->ydata-profiling) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3,>=2.24.0->ydata-profiling) (2024.8.30) Requirement already satisfied: patsy>=0.5.6 in /opt/anaconda3/lib/python3.12/site-packages (from statsmodels<1,>=0.13.2->ydata-profiling) (0.5.6) Requirement 
already satisfied: attrs>=19.3.0 in /opt/anaconda3/lib/python3.12/site-packages (from visions<0.7.7,>=0.7.5->visions[type_image_path]<0.7.7,>=0.7.5->ydata-profiling) (23.1.0) Requirement already satisfied: networkx>=2.4 in /opt/anaconda3/lib/python3.12/site-packages (from visions<0.7.7,>=0.7.5->visions[type_image_path]<0.7.7,>=0.7.5->ydata-profiling) (3.3) Requirement already satisfied: six in /opt/anaconda3/lib/python3.12/site-packages (from patsy>=0.5.6->statsmodels<1,>=0.13.2->ydata-profiling) (1.16.0) Note: you may need to restart the kernel to use updated packages.
In [3]:
import numpy as np
import pandas as pd
In [4]:
import pandas as pd
import sqlite3
# Normalise the flat diabetes CSV into two SQLite tables (patient demographics
# and per-patient medical measurements), then rebuild the flat view with a JOIN
# to confirm the round trip keeps every row.

# File path
# NOTE(review): machine-specific absolute path; consider a configurable data dir.
csv_file = "/Users/skgokulkumar/Documents/work/Data/diabetes.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(csv_file)

# SQL query with a join to reconstruct the original data.
# ORDER BY pins the row order, which SQL does not otherwise guarantee.
query = """
SELECT
    p.PatientID,
    m.Pregnancies,
    m.Glucose,
    m.BloodPressure,
    m.SkinThickness,
    m.Insulin,
    m.BMI,
    m.DiabetesPedigreeFunction,
    p.Age,
    p.Outcome
FROM
    Patient_Data p
JOIN
    Medical_Records m
ON
    p.PatientID = m.PatientID
ORDER BY
    p.PatientID
"""

# Connect to an SQLite database
conn = sqlite3.connect("diabetes.db")
try:
    # SQLite only enforces FOREIGN KEY clauses when this pragma is enabled on
    # the connection; it must be issued before any transaction is opened.
    conn.execute("PRAGMA foreign_keys = ON")
    cursor = conn.cursor()

    # Drop tables if they already exist. The child table goes first so the
    # parent is never removed while rows still reference it.
    cursor.execute("DROP TABLE IF EXISTS Medical_Records")
    cursor.execute("DROP TABLE IF EXISTS Patient_Data")

    # Normalize the data into separate tables
    # Create a table for patient demographics
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS Patient_Data (
            PatientID INTEGER PRIMARY KEY AUTOINCREMENT,
            Age INTEGER,
            Outcome INTEGER
        )
    """)

    # Create a table for medical measurements
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS Medical_Records (
            Medical_Records_ID INTEGER PRIMARY KEY AUTOINCREMENT,
            PatientID INTEGER,
            Pregnancies INTEGER,
            Glucose INTEGER,
            BloodPressure INTEGER,
            SkinThickness INTEGER,
            Insulin INTEGER,
            BMI FLOAT,
            DiabetesPedigreeFunction FLOAT,
            FOREIGN KEY (PatientID) REFERENCES Patient_Data (PatientID)
        )
    """)

    # Insert data into the tables.
    # itertuples() keeps each column's own dtype; iterrows() would upcast the
    # whole row to a single dtype, so integer columns only reached SQLite
    # because they had silently become floats.
    for row in df.itertuples(index=False):
        # Insert into Patient_Data
        cursor.execute(
            """
            INSERT INTO Patient_Data (Age, Outcome)
            VALUES (?, ?)
            """,
            (row.Age, row.Outcome),
        )
        # Key generated for the patient row just inserted; links the two tables.
        patient_id = cursor.lastrowid

        # Insert into Medical_Records
        cursor.execute(
            """
            INSERT INTO Medical_Records (
                PatientID, Pregnancies, Glucose, BloodPressure,
                SkinThickness, Insulin, BMI, DiabetesPedigreeFunction
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                patient_id, row.Pregnancies, row.Glucose,
                row.BloodPressure, row.SkinThickness, row.Insulin,
                row.BMI, row.DiabetesPedigreeFunction,
            ),
        )

    # Commit the changes
    conn.commit()

    # Execute the query and load the result into a Pandas DataFrame
    reconstructed_df = pd.read_sql_query(query, conn)
finally:
    # Close the database connection even if a statement above failed
    conn.close()

# The join must give back exactly one row per CSV row.
assert len(reconstructed_df) == len(df), "row count changed during normalisation"

# Display the reconstructed DataFrame
print(reconstructed_df)
PatientID Pregnancies Glucose BloodPressure SkinThickness Insulin \
0 1 6 148 72 35 0
1 2 1 85 66 29 0
2 3 8 183 64 0 0
3 4 1 89 66 23 94
4 5 0 137 40 35 168
.. ... ... ... ... ... ...
763 764 10 101 76 48 180
764 765 2 122 70 27 0
765 766 5 121 72 23 112
766 767 1 126 60 0 0
767 768 1 93 70 31 0
BMI DiabetesPedigreeFunction Age Outcome
0 33.6 0.627 50 1
1 26.6 0.351 31 0
2 23.3 0.672 32 1
3 28.1 0.167 21 0
4 43.1 2.288 33 1
.. ... ... ... ...
763 32.9 0.171 63 0
764 36.8 0.340 27 0
765 26.2 0.245 30 0
766 30.1 0.349 47 1
767 30.4 0.315 23 0
[768 rows x 10 columns]
In [5]:
# Exploratory data analysis (EDA)
In [6]:
import pandas as pd
import sqlite3
# Reload the flat, joined view of the two tables from diabetes.db into `data`
# and print a quick structural summary of it.

# SQL query to fetch data from the tables.
# ORDER BY pins the row order, which SQL does not otherwise guarantee.
query = """
SELECT
    p.PatientID,
    m.Pregnancies,
    m.Glucose,
    m.BloodPressure,
    m.SkinThickness,
    m.Insulin,
    m.BMI,
    m.DiabetesPedigreeFunction,
    p.Age,
    p.Outcome
FROM
    Patient_Data p
JOIN
    Medical_Records m
ON
    p.PatientID = m.PatientID
ORDER BY
    p.PatientID
"""

# Connect to the SQLite database
conn = sqlite3.connect("diabetes.db")
try:
    # Execute the query and load the result into a Pandas DataFrame
    data = pd.read_sql_query(query, conn)
finally:
    # Close the database connection even if the query failed
    conn.close()

# Display the first few rows to ensure it loaded correctly
print("First 5 rows of the dataset:")
print(data.head())

# Display dataset information (column names, data types, etc.).
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
print("\nDataset information:")
data.info()
First 5 rows of the dataset:
PatientID Pregnancies Glucose BloodPressure SkinThickness Insulin \
0 1 6 148 72 35 0
1 2 1 85 66 29 0
2 3 8 183 64 0 0
3 4 1 89 66 23 94
4 5 0 137 40 35 168
BMI DiabetesPedigreeFunction Age Outcome
0 33.6 0.627 50 1
1 26.6 0.351 31 0
2 23.3 0.672 32 1
3 28.1 0.167 21 0
4 43.1 2.288 33 1
Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PatientID 768 non-null int64
1 Pregnancies 768 non-null int64
2 Glucose 768 non-null int64
3 BloodPressure 768 non-null int64
4 SkinThickness 768 non-null int64
5 Insulin 768 non-null int64
6 BMI 768 non-null float64
7 DiabetesPedigreeFunction 768 non-null float64
8 Age 768 non-null int64
9 Outcome 768 non-null int64
dtypes: float64(2), int64(8)
memory usage: 60.1 KB
None
In [7]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
# Build the profiling report object for the joined dataset loaded above.
profile = ProfileReport(
    data,
    title="Pandas Profiling Report",
)
In [8]:
# Leave the report as the cell's last expression so Jupyter shows it inline.
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]